1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Data from the speech features

1.2 The data set

TADPOLE_D1_D2 <- read.csv("~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2.csv")
TADPOLE_D1_D2_Dict <- read.csv("~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2_Dict.csv")
TADPOLE_D1_D2_Dict_LR <- as.data.frame(read_excel("~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2_Dict_LR.xlsx",sheet = "LeftRightFeatures"))


rownames(TADPOLE_D1_D2_Dict) <- TADPOLE_D1_D2_Dict$FLDNAME

1.3 Conditioning the data


# mm3 to mm
isVolume <- c("Ventricles","Hippocampus","WholeBrain","Entorhinal","Fusiform","MidTemp","ICV",
              TADPOLE_D1_D2_Dict$FLDNAME[str_detect(TADPOLE_D1_D2_Dict$TEXT,"Volume")]
              )


#TADPOLE_D1_D2[,isVolume] <- apply(TADPOLE_D1_D2[,isVolume],2,'^',(1/3))
TADPOLE_D1_D2[,isVolume] <- TADPOLE_D1_D2[,isVolume]^(1/3)

# mm2 to mm
isArea <- TADPOLE_D1_D2_Dict$FLDNAME[str_detect(TADPOLE_D1_D2_Dict$TEXT,"Area")]
TADPOLE_D1_D2[,isArea] <- sqrt(TADPOLE_D1_D2[,isArea])

# Get only cross sectional measurements
FreeSurfersetCross <- str_detect(colnames(TADPOLE_D1_D2),"UCSFFSX")

# The subset of baseline measurements
baselineTadpole <- subset(TADPOLE_D1_D2,VISCODE=="bl")
table(baselineTadpole$DX)
                   Dementia Dementia to MCI             MCI MCI to Dementia 
          7             336               1             864               5 
  MCI to NL              NL       NL to MCI 
          2             521               1 
table(baselineTadpole$DX_bl)

AD CN EMCI LMCI SMC 342 417 310 562 106


rownames(baselineTadpole) <- baselineTadpole$PTID


validBaselineTadpole <- cbind(DX=baselineTadpole$DX_bl,
                                 AGE=baselineTadpole$AGE,
                                 Gender=1*(baselineTadpole$PTGENDER=="Female"),
                                 ADAS11=baselineTadpole$ADAS11,
                                 ADAS13=baselineTadpole$ADAS13,
                                 MMSE=baselineTadpole$MMSE,
                                 RAVLT_immediate=baselineTadpole$RAVLT_immediate,
                                 RAVLT_learning=baselineTadpole$RAVLT_learning,
                                 RAVLT_forgetting=baselineTadpole$RAVLT_forgetting,
                                 RAVLT_perc_forgetting=baselineTadpole$RAVLT_perc_forgetting,
                                 FAQ=baselineTadpole$FAQ,
                                 Ventricles=baselineTadpole$Ventricles,
                                 Hippocampus=baselineTadpole$Hippocampus,
                                 WholeBrain=baselineTadpole$WholeBrain,
                                 Entorhinal=baselineTadpole$Entorhinal,
                                 Fusiform=baselineTadpole$Fusiform,
                                 MidTemp=baselineTadpole$MidTemp,
                                 ICV=baselineTadpole$ICV,
                                 baselineTadpole[,FreeSurfersetCross])


LeftFields <- TADPOLE_D1_D2_Dict_LR$LFN
names(LeftFields) <- LeftFields
LeftFields <- LeftFields[LeftFields %in% colnames(validBaselineTadpole)]
RightFields <- TADPOLE_D1_D2_Dict_LR$RFN
names(RightFields) <- RightFields
RightFields <- RightFields[RightFields %in% colnames(validBaselineTadpole)]

## Normalize to ICV
validBaselineTadpole$Ventricles=validBaselineTadpole$Ventricles/validBaselineTadpole$ICV
validBaselineTadpole$Hippocampus=validBaselineTadpole$Hippocampus/validBaselineTadpole$ICV
validBaselineTadpole$WholeBrain=validBaselineTadpole$WholeBrain/validBaselineTadpole$ICV
validBaselineTadpole$Entorhinal=validBaselineTadpole$Entorhinal/validBaselineTadpole$ICV
validBaselineTadpole$Fusiform=validBaselineTadpole$Fusiform/validBaselineTadpole$ICV
validBaselineTadpole$MidTemp=validBaselineTadpole$MidTemp/validBaselineTadpole$ICV

leftData <- validBaselineTadpole[,LeftFields]/validBaselineTadpole$ICV
RightData <- validBaselineTadpole[,RightFields]/validBaselineTadpole$ICV

## get mean and relative difference 
meanLeftRight <- (leftData + RightData)/2
difLeftRight <- abs(leftData - RightData)
reldifLeftRight <- difLeftRight/meanLeftRight
colnames(meanLeftRight) <- paste("M",colnames(meanLeftRight),sep="_")
colnames(difLeftRight) <- paste("D",colnames(difLeftRight),sep="_")
colnames(reldifLeftRight) <- paste("RD",colnames(reldifLeftRight),sep="_")


validBaselineTadpole <- validBaselineTadpole[,!(colnames(validBaselineTadpole) %in% 
                                               c(LeftFields,RightFields))]
#validBaselineTadpole <- cbind(validBaselineTadpole,meanLeftRight,difLeftRight,reldifLeftRight)
validBaselineTadpole <- cbind(validBaselineTadpole,meanLeftRight,difLeftRight)

## Remove columns with too many NA more than %15 of NA
nacount <- apply(is.na(validBaselineTadpole),2,sum)/nrow(validBaselineTadpole) < 0.15
diagnose <- validBaselineTadpole$DX
pander::pander(table(diagnose))
AD CN EMCI LMCI SMC
342 417 310 562 106
validBaselineTadpole <- validBaselineTadpole[,nacount]
## Remove character columns
ischar <- sapply(validBaselineTadpole,class) == "character"
validBaselineTadpole <- validBaselineTadpole[,!ischar]
## Place back diagnose
validBaselineTadpole$DX <- diagnose


validBaselineTadpole <- validBaselineTadpole[complete.cases(validBaselineTadpole),]
ischar <- sapply(validBaselineTadpole,class) == "character"
validBaselineTadpole[,!ischar] <- sapply(validBaselineTadpole[,!ischar],as.numeric)

colnames(validBaselineTadpole) <- str_remove_all(colnames(validBaselineTadpole),"_UCSFFSX_11_02_15_UCSFFSX51_08_01_16")
colnames(validBaselineTadpole) <- str_replace_all(colnames(validBaselineTadpole)," ","_")
validBaselineTadpole$LONISID <- NULL
validBaselineTadpole$IMAGEUID <- NULL
validBaselineTadpole$LONIUID <- NULL

diagnose <- as.character(validBaselineTadpole$DX)
validBaselineTadpole$DX <- diagnose
pander::pander(table(validBaselineTadpole$DX))
AD CN EMCI LMCI SMC
245 359 272 444 93


validBaselineTadpole[validBaselineTadpole$DX %in% c("EMCI","LMCI"),"DX"] <- "MCI" 
validBaselineTadpole[validBaselineTadpole$DX %in% c("CN","SMC"),"DX"] <- "NL" 

pander::pander(table(validBaselineTadpole$DX))
AD MCI NL
245 716 452

1.4 Get the Time To Event on MCI Subjects


subjectsID <- rownames(validBaselineTadpole)
visitsID <- unique(TADPOLE_D1_D2$VISCODE)
baseDx <- TADPOLE_D1_D2[TADPOLE_D1_D2$VISCODE=="bl",c("PTID","DX","EXAMDATE")]
rownames(baseDx) <- baseDx$PTID 
baseDx <- baseDx[subjectsID,]
lastDx <- baseDx
toDementia <- baseDx
table(lastDx$DX)
   Dementia Dementia to MCI             MCI MCI to Dementia       MCI to NL 
        244               1             711               2               2 
         NL       NL to MCI 
        452               1 
hasDementia <- lastDx$PTID[str_detect(lastDx$DX,"Dementia")]


for (vid in visitsID)
{
  DxValue <- TADPOLE_D1_D2[TADPOLE_D1_D2$VISCODE==vid,c("PTID","DX","EXAMDATE")]
  rownames(DxValue) <- DxValue$PTID 
  DxValue <- DxValue[DxValue$PTID %in% subjectsID,]
  noDX <- DxValue$PTID[nchar(DxValue$DX) < 1]
  print(length(noDX))
  DxValue[noDX,] <- lastDx[noDX,]
  inLast <- lastDx$PTID[lastDx$PTID %in% DxValue$PTID]
  print(length(inLast))
  lastDx[inLast,] <- DxValue[inLast,]
  noDementia <- !(toDementia$PTID %in% hasDementia)
  toDementia[noDementia,] <- lastDx[noDementia,]
  hasDementia <- unique(c(hasDementia,lastDx$PTID[str_detect(lastDx$DX,"Dementia")]))
}

[1] 0 [1] 1413 [1] 2 [1] 1326 [1] 6 [1] 1218 [1] 23 [1] 1095 [1] 805 [1] 1058 [1] 29 [1] 710 [1] 20 [1] 212 [1] 14 [1] 167 [1] 32 [1] 553 [1] 25 [1] 298 [1] 18 [1] 130 [1] 667 [1] 667 [1] 112 [1] 112 [1] 176 [1] 176 [1] 177 [1] 177 [1] 625 [1] 625 [1] 251 [1] 251 [1] 159 [1] 159 [1] 7 [1] 7 [1] 17 [1] 99 [1] 9 [1] 63 [1] 1 [1] 1

table(lastDx$DX)
   Dementia Dementia to MCI             MCI MCI to Dementia       MCI to NL 
        428               2             463              80               7 
         NL  NL to Dementia       NL to MCI 
        406               1              26 
baseMCI <-baseDx$PTID[baseDx$DX == "MCI"]
lastDementia <- lastDx$PTID[str_detect(lastDx$DX,"Dementia")]
lastDementia2 <- toDementia$PTID[str_detect(toDementia$DX,"Dementia")]
lastNL <- lastDx$PTID[str_detect(lastDx$DX,"NL")]

MCIatBaseline <- baseDx[baseMCI,]
MCIatEvent <- toDementia[baseMCI,]
MCIatLast <- lastDx[baseMCI,]

MCIconverters <- MCIatBaseline[baseMCI %in% lastDementia,]
MCI_No_converters <- MCIatBaseline[!(baseMCI %in% MCIconverters$PTID),]
MCIconverters$TimeToEvent <- (as.Date(toDementia[MCIconverters$PTID,"EXAMDATE"]) 
                                   - as.Date(MCIconverters$EXAMDATE))

sum(MCIconverters$TimeToEvent ==0)

[1] 0



MCIconverters$AtEventDX <- MCIatEvent[MCIconverters$PTID,"DX"]
MCIconverters$LastDX <- MCIatLast[MCIconverters$PTID,"DX"]

MCI_No_converters$TimeToEvent <- (as.Date(lastDx[MCI_No_converters$PTID,"EXAMDATE"]) 
                                   - as.Date(MCI_No_converters$EXAMDATE))

MCI_No_converters$LastDX <- MCIatLast[MCI_No_converters$PTID,"DX"]

MCI_No_converters <- subset(MCI_No_converters,TimeToEvent > 0)

2 Prognosis MCI to AD Conversion

2.1 the set


MCIPrognosisIDs <- c(MCIconverters$PTID,MCI_No_converters$PTID)

TADPOLECrossMRI <- validBaselineTadpole[MCIPrognosisIDs,]
table(TADPOLECrossMRI$DX)

MCI 680

TADPOLECrossMRI$DX <- NULL
TADPOLECrossMRI$status <- 1*(rownames(TADPOLECrossMRI) %in% MCIconverters$PTID)
table(TADPOLECrossMRI$status)

0 1 436 244

2.1.0.1 Standarize the names for the reporting

studyName <- "TADPOLE"
dataframe <- TADPOLECrossMRI
outcome <- "status"

TopVariables <- 10

thro <- 0.60
cexheat = 0.15

2.2 Generaring the report

2.2.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

2.2.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
680 327
pander::pander(table(dataframe[,outcome]))
0 1
436 244

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

2.2.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) > 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

2.3 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

2.3.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9996707

2.4 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 326 , Uni p: 0.008991707 , Uncorrelated Base: 152 , Outcome-Driven Size: 0 , Base Size: 152 
#> 
#> 
 1 <R=1.000,r=0.975,N=    6>, Top: 3( 1 )[ 1 : 3 Fa= 3 : 0.975 ]( 3 , 3 , 0 ),<|>Tot Used: 6 , Added: 3 , Zero Std: 0 , Max Cor: 0.952
#> 
 2 <R=0.952,r=0.951,N=    6>, Top: 1( 1 )[ 1 : 1 Fa= 4 : 0.951 ]( 1 , 1 , 3 ),<|>Tot Used: 8 , Added: 1 , Zero Std: 0 , Max Cor: 0.909
#> 
 3 <R=0.909,r=0.904,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 5 : 0.904 ]( 1 , 1 , 4 ),<|>Tot Used: 10 , Added: 1 , Zero Std: 0 , Max Cor: 0.901
#> 
 4 <R=0.901,r=0.850,N=   17>, Top: 6( 1 )[ 1 : 6 Fa= 10 : 0.850 ]( 6 , 8 , 5 ),<|>Tot Used: 23 , Added: 8 , Zero Std: 0 , Max Cor: 0.867
#> 
 5 <R=0.867,r=0.834,N=   17>, Top: 3( 1 )[ 1 : 3 Fa= 12 : 0.834 ]( 3 , 5 , 10 ),<|>Tot Used: 28 , Added: 5 , Zero Std: 0 , Max Cor: 0.829
#> 
 6 <R=0.829,r=0.814,N=   17>, Top: 4( 1 )[ 1 : 4 Fa= 15 : 0.814 ]( 4 , 7 , 12 ),<|>Tot Used: 37 , Added: 7 , Zero Std: 0 , Max Cor: 0.813
#> 
 7 <R=0.813,r=0.807,N=   17>, Top: 5( 1 )[ 1 : 5 Fa= 19 : 0.807 ]( 5 , 5 , 15 ),<|>Tot Used: 46 , Added: 5 , Zero Std: 0 , Max Cor: 0.862
#> 
 8 <R=0.862,r=0.831,N=   17>, Top: 2( 1 )[ 1 : 2 Fa= 21 : 0.831 ]( 2 , 2 , 19 ),<|>Tot Used: 48 , Added: 2 , Zero Std: 0 , Max Cor: 0.805
#> 
 9 <R=0.805,r=0.753,N=   59>, Top: 27( 1 )[ 1 : 27 Fa= 43 : 0.753 ]( 27 , 29 , 21 ),<|>Tot Used: 99 , Added: 29 , Zero Std: 0 , Max Cor: 0.913
#> 
 10 <R=0.913,r=0.806,N=   59>, Top: 4( 1 )[ 1 : 4 Fa= 46 : 0.806 ]( 4 , 4 , 43 ),<|>Tot Used: 102 , Added: 4 , Zero Std: 0 , Max Cor: 0.874
#> 
 11 <R=0.874,r=0.687,N=   83>, Top: 27( 1 )[ 1 : 27 Fa= 63 : 0.687 ]( 24 , 37 , 46 ),<|>Tot Used: 145 , Added: 37 , Zero Std: 0 , Max Cor: 0.953
#> 
 12 <R=0.953,r=0.726,N=   83>, Top: 9( 1 )[ 1 : 9 Fa= 70 : 0.726 ]( 9 , 9 , 63 ),<|>Tot Used: 151 , Added: 9 , Zero Std: 0 , Max Cor: 0.901
#> 
 13 <R=0.901,r=0.700,N=   83>, Top: 4( 1 )[ 1 : 4 Fa= 74 : 0.700 ]( 4 , 5 , 70 ),<|>Tot Used: 154 , Added: 5 , Zero Std: 0 , Max Cor: 0.698
#> 
 14 <R=0.698,r=0.600,N=   73>, Top: 19( 6 )[ 1 : 19 Fa= 85 : 0.600 ]( 18 , 33 , 74 ),<|>Tot Used: 182 , Added: 33 , Zero Std: 0 , Max Cor: 0.736
#> 
 15 <R=0.736,r=0.600,N=   73>, Top: 9( 1 )[ 1 : 9 Fa= 89 : 0.600 ]( 8 , 13 , 85 ),<|>Tot Used: 189 , Added: 13 , Zero Std: 0 , Max Cor: 0.858
#> 
 16 <R=0.858,r=0.600,N=   73>, Top: 4( 1 )[ 1 : 4 Fa= 92 : 0.600 ]( 4 , 5 , 89 ),<|>Tot Used: 190 , Added: 5 , Zero Std: 0 , Max Cor: 0.848
#> 
 17 <R=0.848,r=0.600,N=    8>, Top: 4( 1 )[ 1 : 4 Fa= 93 : 0.600 ]( 3 , 3 , 92 ),<|>Tot Used: 190 , Added: 3 , Zero Std: 0 , Max Cor: 0.853
#> 
 18 <R=0.853,r=0.600,N=    8>, Top: 1( 1 )[ 1 : 1 Fa= 93 : 0.600 ]( 1 , 1 , 93 ),<|>Tot Used: 190 , Added: 1 , Zero Std: 0 , Max Cor: 0.599
#> 
 19 <R=0.599,r=0.600,N=    0>
#> 
 [ 19 ], 0.5992682 Decor Dimension: 190 Nused: 190 . Cor to Base: 91 , ABase: 44 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

1377

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

416

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

1.01

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

0.742

2.4.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

2.5 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

2.6 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.5992682

2.7 U-MAP Visualization of features

2.7.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

2.7.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

2.8 Univariate Analysis

2.8.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : M_ST24SA 200 : D_ST49TA 300 : D_ST47CV




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : M_ST24SA 200 : D_ST49TA 300 : La_D_ST47CV

2.8.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
ADAS13 20.7611 6.16923 14.0091 5.78970 0.03549 0.788
ADAS11 12.8635 4.56128 8.7155 3.84978 0.00264 0.761
FAQ 5.4631 4.90262 1.9266 2.98257 0.00000 0.756
M_ST40CV 0.1799 0.00875 0.1875 0.00763 0.28199 0.750
M_ST29SV 0.1253 0.00708 0.1321 0.00750 0.58088 0.745
M_ST12SV 0.0913 0.00535 0.0962 0.00550 0.50030 0.744
Hippocampus 0.1582 0.00886 0.1664 0.00945 0.44340 0.737
RAVLT_immediate 29.0205 7.69236 37.2798 10.92838 0.04406 0.728
M_ST24CV 0.0996 0.00800 0.1059 0.00706 0.04673 0.727
M_ST31CV 0.1910 0.00945 0.1986 0.00902 0.94566 0.717


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
ADAS11 12.8635 4.561284 8.7155 3.849783 2.64e-03 0.761
FAQ 5.4631 4.902619 1.9266 2.982574 0.00e+00 0.756
M_ST32CV 0.1786 0.008747 0.1853 0.007976 6.03e-01 0.711
M_ST57CV 0.1909 0.009194 0.1972 0.008877 5.58e-01 0.690
La_M_ST29SV 0.0337 0.005815 0.0376 0.005699 8.31e-01 0.684
M_ST129CV 0.1559 0.007028 0.1604 0.007074 6.68e-01 0.680
RAVLT_learning 3.1762 2.396347 4.6651 2.556538 1.35e-04 0.673
La_M_ST30SV -0.0058 0.010722 -0.0127 0.011405 9.50e-01 0.669
MMSE 26.9180 1.764255 27.9610 1.752723 2.78e-15 0.668
M_ST44CV 0.1062 0.005720 0.1096 0.006072 8.52e-01 0.658
La_M_ST11SV 0.0420 0.000658 0.0424 0.000695 8.29e-01 0.650
La_M_ST40CV 0.0579 0.006301 0.0610 0.006277 7.77e-01 0.638
La_ADAS13 2.6723 2.070109 1.7532 1.988479 4.56e-01 0.631

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.32 131 0.401


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
ADAS13 NA 20.7611 6.169227 14.0091 5.789699 3.55e-02 0.788 0.788 NA
ADAS11 12.8635 4.561284 8.7155 3.849783 2.64e-03 0.761 0.761 2
FAQ 5.4631 4.902619 1.9266 2.982574 0.00e+00 0.756 0.756 NA
M_ST40CV NA 0.1799 0.008748 0.1875 0.007629 2.82e-01 0.750 0.750 NA
M_ST29SV NA 0.1253 0.007075 0.1321 0.007502 5.81e-01 0.745 0.745 NA
M_ST32CV 0.1786 0.008747 0.1853 0.007976 6.03e-01 0.711 0.711 2
M_ST57CV 0.1909 0.009194 0.1972 0.008877 5.58e-01 0.690 0.690 NA
La_M_ST29SV -0.862M_ST44CV + 1.000M_ST29SV 0.0337 0.005815 0.0376 0.005699 8.31e-01 0.684 0.745 3
M_ST30SV NA 0.0887 0.018183 0.0766 0.018707 1.29e-01 0.681 0.681 NA
M_ST129CV 0.1559 0.007028 0.1604 0.007074 6.68e-01 0.680 0.680 16
RAVLT_learning 3.1762 2.396347 4.6651 2.556538 1.35e-04 0.673 0.673 NA
La_M_ST30SV -0.319Ventricles + 1.000M_ST30SV -0.0058 0.010722 -0.0127 0.011405 9.50e-01 0.669 0.681 -1
MMSE 26.9180 1.764255 27.9610 1.752723 2.78e-15 0.668 0.668 NA
M_ST44CV 0.1062 0.005720 0.1096 0.006072 8.52e-01 0.658 0.658 3
La_M_ST11SV + 0.000ICV + 1.000M_ST11SV 0.0420 0.000658 0.0424 0.000695 8.29e-01 0.650 0.645 -1
M_ST11SV NA 0.0238 0.000880 0.0243 0.000954 5.68e-01 0.645 0.645 NA
La_M_ST40CV -0.683M_ST32CV + 1.000M_ST40CV 0.0579 0.006301 0.0610 0.006277 7.77e-01 0.638 0.750 -1
La_ADAS13 -1.406ADAS11 + 1.000ADAS13 2.6723 2.070109 1.7532 1.988479 4.56e-01 0.631 0.788 -1
Ventricles NA 0.2966 0.045981 0.2801 0.049256 3.12e-01 0.603 0.603 4
ICV NA 115.7974 4.383457 115.1393 3.915295 9.81e-01 0.540 0.540 3

2.9 Comparing IDeA vs PCA vs EFA

2.9.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,tol=0.002)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

2.9.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

2.10 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 330 106
1 36 208
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.4618 0.4238 0.500
    tp 0.3588 0.3227 0.396
    se 0.8525 0.8016 0.894
    sp 0.7569 0.7138 0.796
    diag.ac 0.7912 0.7587 0.821
    diag.or 17.9874 11.8659 27.267
    nndx 1.6411 1.4474 1.940
    youden 0.6093 0.5154 0.691
    pv.pos 0.6624 0.6072 0.715
    pv.neg 0.9016 0.8664 0.930
    lr.pos 3.5063 2.9474 4.171
    lr.neg 0.1949 0.1435 0.265
    p.rout 0.5382 0.4999 0.576
    p.rin 0.4618 0.4238 0.500
    p.tpdn 0.2431 0.2036 0.286
    p.tndp 0.1475 0.1055 0.198
    p.dntp 0.3376 0.2854 0.393
    p.dptn 0.0984 0.0698 0.134
  • tab:

      Outcome + Outcome - Total
    Test + 208 106 314
    Test - 36 330 366
    Total 244 436 680
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.791 0.759 0.821
3 se 0.852 0.802 0.894
4 sp 0.757 0.714 0.796
6 diag.or 17.987 11.866 27.267

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 361 75
1 73 171
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.362 0.326 0.399
    tp 0.359 0.323 0.396
    se 0.701 0.639 0.758
    sp 0.828 0.789 0.862
    diag.ac 0.782 0.749 0.813
    diag.or 11.275 7.788 16.324
    nndx 1.891 1.613 2.335
    youden 0.529 0.428 0.620
    pv.pos 0.695 0.633 0.752
    pv.neg 0.832 0.793 0.866
    lr.pos 4.074 3.264 5.085
    lr.neg 0.361 0.297 0.440
    p.rout 0.638 0.601 0.674
    p.rin 0.362 0.326 0.399
    p.tpdn 0.172 0.138 0.211
    p.tndp 0.299 0.242 0.361
    p.dntp 0.305 0.248 0.367
    p.dptn 0.168 0.134 0.207
  • tab:

      Outcome + Outcome - Total
    Test + 171 75 246
    Test - 73 361 434
    Total 244 436 680
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.782 0.749 0.813
3 se 0.701 0.639 0.758
4 sp 0.828 0.789 0.862
6 diag.or 11.275 7.788 16.324

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 345 91
1 90 154
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.360 0.324 0.398
    tp 0.359 0.323 0.396
    se 0.631 0.567 0.692
    sp 0.791 0.750 0.828
    diag.ac 0.734 0.699 0.767
    diag.or 6.487 4.581 9.186
    nndx 2.367 1.922 3.151
    youden 0.422 0.317 0.520
    pv.pos 0.629 0.565 0.689
    pv.neg 0.793 0.752 0.830
    lr.pos 3.024 2.460 3.717
    lr.neg 0.466 0.393 0.553
    p.rout 0.640 0.602 0.676
    p.rin 0.360 0.324 0.398
    p.tpdn 0.209 0.172 0.250
    p.tndp 0.369 0.308 0.433
    p.dntp 0.371 0.311 0.435
    p.dptn 0.207 0.170 0.248
  • tab:

      Outcome + Outcome - Total
    Test + 154 91 245
    Test - 90 345 435
    Total 244 436 680
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.734 0.699 0.767
3 se 0.631 0.567 0.692
4 sp 0.791 0.750 0.828
6 diag.or 6.487 4.581 9.186


par(op)

2.10.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 323 113
1 63 181
  pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.432 0.395 0.471
    tp 0.359 0.323 0.396
    se 0.742 0.682 0.796
    sp 0.741 0.697 0.781
    diag.ac 0.741 0.707 0.774
    diag.or 8.212 5.742 11.746
    nndx 2.072 1.733 2.638
    youden 0.483 0.379 0.577
    pv.pos 0.616 0.557 0.672
    pv.neg 0.837 0.796 0.872
    lr.pos 2.862 2.402 3.410
    lr.neg 0.349 0.280 0.434
    p.rout 0.568 0.529 0.605
    p.rin 0.432 0.395 0.471
    p.tpdn 0.259 0.219 0.303
    p.tndp 0.258 0.204 0.318
    p.dntp 0.384 0.328 0.443
    p.dptn 0.163 0.128 0.204
  • tab:

      Outcome + Outcome - Total
    Test + 181 113 294
    Test - 63 323 386
    Total 244 436 680
  • method: exact

  • digits: 2

  • conf.level: 0.95

  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.741 0.707 0.774
3 se 0.742 0.682 0.796
4 sp 0.741 0.697 0.781
6 diag.or 8.212 5.742 11.746
  par(op)